Loading the libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(broom)
library(styler)

1 Reading the CSV file into a tibble

gapminder_data <- read.csv("/Users/yasemindilarasucu/Desktop/gapminder_data_ys/gapminder_clean.csv")

Filtering the Data by the year 1962

filtered_data_1962 <- gapminder_data %>%
  filter(Year == 1962)

Creating a Scatter plot

ggplot(filtered_data_1962, aes(x = " CO2 emissions (metric tons per capital) ", y = gdpPercap)) +
  geom_point() +
  labs(
    title = "Scatter Plot of CO2 Emissions vs. GDP per Capita (1962)",
    x = "CO2 Emissions (metric tons per capita)",
    y = "GDP per Capita"
  )
## Warning: Removed 131 rows containing missing values (`geom_point()`).

Checking for the missing values in the dataset

summary(gapminder_data)
##        X          Country.Name            Year     
##  Min.   :   0.0   Length:2607        Min.   :1962  
##  1st Qu.: 651.5   Class :character   1st Qu.:1972  
##  Median :1303.0   Mode  :character   Median :1987  
##  Mean   :1303.0                      Mean   :1985  
##  3rd Qu.:1954.5                      3rd Qu.:1997  
##  Max.   :2606.0                      Max.   :2007  
##                                                    
##  Agriculture..value.added....of.GDP. CO2.emissions..metric.tons.per.capita.
##  Min.   : 0.000                      Min.   : 0.0017                       
##  1st Qu.: 6.745                      1st Qu.: 0.4334                       
##  Median :16.942                      Median : 1.5999                       
##  Mean   :19.992                      Mean   : 4.1472                       
##  3rd Qu.:30.807                      3rd Qu.: 5.6550                       
##  Max.   :94.846                      Max.   :82.7189                       
##  NA's   :1179                        NA's   :414                           
##  Domestic.credit.provided.by.financial.sector....of.GDP.
##  Min.   :-73.66                                         
##  1st Qu.: 21.71                                         
##  Median : 39.17                                         
##  Mean   : 50.06                                         
##  3rd Qu.: 64.57                                         
##  Max.   :301.19                                         
##  NA's   :864                                            
##  Electric.power.consumption..kWh.per.capita.
##  Min.   :    7.61                           
##  1st Qu.:  347.36                           
##  Median : 1216.23                           
##  Mean   : 2750.88                           
##  3rd Qu.: 3735.53                           
##  Max.   :36852.54                           
##  NA's   :1238                               
##  Energy.use..kg.of.oil.equivalent.per.capita.
##  Min.   :    9.72                            
##  1st Qu.:  522.66                            
##  Median : 1034.63                            
##  Mean   : 2131.31                            
##  3rd Qu.: 2835.76                            
##  Max.   :36146.70                            
##  NA's   :1197                                
##  Exports.of.goods.and.services....of.GDP.
##  Min.   :  0.1382                        
##  1st Qu.: 16.3829                        
##  Median : 26.4771                        
##  Mean   : 32.1762                        
##  3rd Qu.: 41.3692                        
##  Max.   :214.7423                        
##  NA's   :798                             
##  Fertility.rate..total..births.per.woman. GDP.growth..annual...
##  Min.   :0.852                            Min.   :-44.900      
##  1st Qu.:2.331                            1st Qu.:  1.486      
##  Median :4.052                            Median :  3.812      
##  Mean   :4.217                            Mean   :  3.997      
##  3rd Qu.:6.097                            3rd Qu.:  6.351      
##  Max.   :8.838                            Max.   :149.973      
##  NA's   :184                              NA's   :691          
##  Imports.of.goods.and.services....of.GDP. Industry..value.added....of.GDP.
##  Min.   :  0.0795                         Min.   :  3.481                 
##  1st Qu.: 20.4815                         1st Qu.: 20.812                 
##  Median : 30.1252                         Median : 29.146                 
##  Mean   : 37.1610                         Mean   : 29.742                 
##  3rd Qu.: 47.6579                         3rd Qu.: 36.274                 
##  Max.   :330.5633                         Max.   :210.607                 
##  NA's   :798                              NA's   :1189                    
##  Inflation..GDP.deflator..annual... Life.expectancy.at.birth..total..years.
##  Min.   : -15.424                   Min.   :19.27                          
##  1st Qu.:   2.775                   1st Qu.:53.96                          
##  Median :   5.700                   Median :65.19                          
##  Mean   :  22.732                   Mean   :62.46                          
##  3rd Qu.:  10.099                   3rd Qu.:71.08                          
##  Max.   :4078.476                   Max.   :82.51                          
##  NA's   :706                        NA's   :189                            
##  Population.density..people.per.sq..km.of.land.area.
##  Min.   :    0.102                                  
##  1st Qu.:   17.443                                  
##  Median :   46.113                                  
##  Mean   :  260.380                                  
##  3rd Qu.:  119.047                                  
##  Max.   :20601.550                                  
##  NA's   :49                                         
##  Services..etc...value.added....of.GDP.      pop             continent        
##  Min.   : 10.07                         Min.   :6.534e+04   Length:2607       
##  1st Qu.: 40.45                         1st Qu.:3.134e+06   Class :character  
##  Median : 49.37                         Median :7.455e+06   Mode  :character  
##  Mean   : 50.62                         Mean   :3.331e+07                     
##  3rd Qu.: 60.96                         3rd Qu.:1.987e+07                     
##  Max.   :100.00                         Max.   :1.319e+09                     
##  NA's   :1186                           NA's   :1323                          
##    gdpPercap     
##  Min.   :   347  
##  1st Qu.:  1253  
##  Median :  4151  
##  Mean   :  8046  
##  3rd Qu.: 10994  
##  Max.   :109348  
##  NA's   :1323
summary(filtered_data_1962)
##        X        Country.Name            Year     
##  Min.   :   0   Length:259         Min.   :1962  
##  1st Qu.: 645   Class :character   1st Qu.:1962  
##  Median :1290   Mode  :character   Median :1962  
##  Mean   :1294                      Mean   :1962  
##  3rd Qu.:1945                      3rd Qu.:1962  
##  Max.   :2597                      Max.   :1962  
##                                                  
##  Agriculture..value.added....of.GDP. CO2.emissions..metric.tons.per.capita.
##  Min.   : 4.903                      Min.   : 0.00848                      
##  1st Qu.:33.994                      1st Qu.: 0.20073                      
##  Median :40.223                      Median : 0.65171                      
##  Mean   :40.028                      Mean   : 2.25427                      
##  3rd Qu.:45.904                      3rd Qu.: 1.94326                      
##  Max.   :94.846                      Max.   :42.63712                      
##  NA's   :210                         NA's   :64                            
##  Domestic.credit.provided.by.financial.sector....of.GDP.
##  Min.   : -1.513                                        
##  1st Qu.: 12.598                                        
##  Median : 21.096                                        
##  Mean   : 26.853                                        
##  3rd Qu.: 34.333                                        
##  Max.   :111.050                                        
##  NA's   :154                                            
##  Electric.power.consumption..kWh.per.capita.
##  Min.   : 111.8                             
##  1st Qu.:1336.9                             
##  Median :2006.8                             
##  Mean   :2467.4                             
##  3rd Qu.:2837.9                             
##  Max.   :9391.0                             
##  NA's   :228                                
##  Energy.use..kg.of.oil.equivalent.per.capita.
##  Min.   :  350.1                             
##  1st Qu.: 1567.7                             
##  Median : 2081.0                             
##  Mean   : 2527.1                             
##  3rd Qu.: 2938.8                             
##  Max.   :10414.5                             
##  NA's   :228                                 
##  Exports.of.goods.and.services....of.GDP.
##  Min.   :  3.518                         
##  1st Qu.:  9.283                         
##  Median : 15.948                         
##  Mean   : 21.706                         
##  3rd Qu.: 27.095                         
##  Max.   :138.181                         
##  NA's   :139                             
##  Fertility.rate..total..births.per.woman. GDP.growth..annual...
##  Min.   :1.790                            Min.   :-19.685      
##  1st Qu.:4.245                            1st Qu.:  3.296      
##  Median :6.065                            Median :  5.112      
##  Mean   :5.480                            Mean   :  5.066      
##  3rd Qu.:6.750                            3rd Qu.:  6.700      
##  Max.   :8.197                            Max.   : 24.521      
##  NA's   :23                               NA's   :135          
##  Imports.of.goods.and.services....of.GDP. Industry..value.added....of.GDP.
##  Min.   :  2.908                          Min.   : 3.52                   
##  1st Qu.: 11.683                          1st Qu.:14.14                   
##  Median : 19.248                          Median :20.40                   
##  Mean   : 24.223                          Mean   :21.90                   
##  3rd Qu.: 29.432                          3rd Qu.:30.43                   
##  Max.   :148.588                          Max.   :41.72                   
##  NA's   :139                              NA's   :215                     
##  Inflation..GDP.deflator..annual... Life.expectancy.at.birth..total..years.
##  Min.   : -7.9713                   Min.   :28.55                          
##  1st Qu.:  0.6961                   1st Qu.:44.82                          
##  Median :  2.3513                   Median :54.29                          
##  Mean   :  5.3079                   Mean   :54.26                          
##  3rd Qu.:  4.0644                   3rd Qu.:64.73                          
##  Max.   :178.6815                   Max.   :73.72                          
##  NA's   :148                        NA's   :23                             
##  Population.density..people.per.sq..km.of.land.area.
##  Min.   :    0.102                                  
##  1st Qu.:   10.832                                  
##  Median :   27.698                                  
##  Mean   :  180.998                                  
##  3rd Qu.:   87.879                                  
##  Max.   :11521.000                                  
##  NA's   :6                                          
##  Services..etc...value.added....of.GDP.      pop             continent        
##  Min.   : 15.42                         Min.   :    65345   Length:259        
##  1st Qu.: 35.03                         1st Qu.:  1852686   Class :character  
##  Median : 39.56                         Median :  4569171   Mode  :character  
##  Mean   : 41.44                         Mean   : 21536525                     
##  3rd Qu.: 44.72                         3rd Qu.: 10812216                     
##  Max.   :100.00                         Max.   :665770000                     
##  NA's   :213                            NA's   :131                           
##    gdpPercap      
##  Min.   :  355.2  
##  1st Qu.: 1056.2  
##  Median : 2510.7  
##  Mean   : 4958.9  
##  3rd Qu.: 6085.3  
##  Max.   :95458.1  
##  NA's   :131

Imputing the NA values for the columns of CO2 Emissions (metric tons per capita and GDP per Capita

filtered_data_1962_imputed <- filtered_data_1962 %>%
  mutate(
    "CO2 Emissions (metric tons per capita)" = ifelse(
      is.na("CO2 Emissions (metric tons per capita)"),
      mean("CO2 Emissions (metric tons per capita)", na.rm = TRUE),
      "CO2 Emissions (metric tons per capita)"
    ),
    gdpPercap = ifelse(
      is.na(gdpPercap),
      mean(gdpPercap, na.rm = TRUE),
      gdpPercap
    )
  )

2 Making a scatter plot with the modified dataset

p <- ggplot(filtered_data_1962_imputed, aes(x = `CO2.emissions..metric.tons.per.capita.`, y = gdpPercap, color = continent)) +
  geom_point() +
  labs(
    title = "Scatter Plot of CO2 Emissions vs. GDP per Capita (1962)",
    x = "CO2 Emissions (metric tons per capita)",
    y = "GDP per Capita"
  )

#applying a theme 

p + theme_classic()
## Warning: Removed 64 rows containing missing values (`geom_point()`).

3 Calculating the correlation in between CO2 emissions (metric tons per capita) and gdpPercap, and p-value.

correlation_result <- cor.test(filtered_data_1962_imputed$`CO2.emissions..metric.tons.per.capita.`,
  filtered_data_1962_imputed$gdpPercap,
  method = "pearson"
)

print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  filtered_data_1962_imputed$CO2.emissions..metric.tons.per.capita. and filtered_data_1962_imputed$gdpPercap
## t = 14.441, df = 193, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6455064 0.7819824
## sample estimates:
##       cor 
## 0.7206543

4 In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?

# Finding the year with the strongest correlation

correlation_results <- lapply(unique(gapminder_data$Year), function(year) {
  subset_data <- gapminder_data %>% filter(Year == year)
  correlation_result <- cor.test(subset_data$`CO2.emissions..metric.tons.per.capita.`,
    subset_data$gdpPercap,
    method = "pearson"
  )
  return(data.frame(Year = year, correlation = correlation_result$estimate))
})

correlation_results_df <- do.call(rbind, correlation_results)
strongest_correlation_year <- correlation_results_df[which.max(correlation_results_df$correlation), "Year"]

# Filtering the data for the year of the strongest correlation

filtered_data_strongest_correlation <- gapminder_data %>% filter(Year == strongest_correlation_year)

5 Creating an interactive scatter plot with plotly for comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap.

# Creating a scatter-plot with ggplot2 with the filtered data from the previous step

gg_plot <- ggplot(filtered_data_strongest_correlation, aes(x = `CO2.emissions..metric.tons.per.capita.`, y = gdpPercap, size = pop, color = continent)) +
  geom_point() +
  labs(
    title = "Interactive Scatter Plot with Plotly",
    x = "CO2 Emissions (metric tons per capita)",
    y = "GDP per Capita"
  )

# Converting ggplot2 plot to plotly

plotly_plot <- ggplotly(gg_plot)

plotly_plot

6 Questions

  1. What’s the relationship in between the continent and ’ Energy use (kg of oil equivalent per capita)’ ?
# Checking the significancy with ANOVA test

anova_result <- aov(`Energy.use..kg.of.oil.equivalent.per.capita.` ~ continent, data = gapminder_data)

summary(anova_result)
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## continent      5 8.124e+08 162482656   21.88 <2e-16 ***
## Residuals   1404 1.043e+10   7426183                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness

Commenting on the results that the analysis shows that there are significant differences in Energy use among the different continents.

Visualization:

# Interactive Boxplot

plot_ly(gapminder_data, x = ~continent, y = ~`Energy.use..kg.of.oil.equivalent.per.capita.`, type = "box") %>%
  layout(
    title = "Energy Use Across Continents",
    xaxis = list(title = "Continent"),
    yaxis = list(title = "Energy Use (kg of oil equivalent per capita)")
  )
## Warning: Ignoring 1197 observations
  1. Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?
# Checking unique levels of the 'continent' variable
unique(gapminder_data$continent)
## [1] "Asia"     "Europe"   "Africa"   ""         "Americas" "Oceania"
# Filter data for non-empty continents and years after 1990
filtered_data_after_1990_cleaned <- gapminder_data %>%
  filter(Year > 1990, continent %in% c("Asia", "Europe", "Africa", "Americas", "Oceania"))

# Explicitly set factor levels for 'continent'
filtered_data_after_1990_cleaned$continent <- factor(filtered_data_after_1990_cleaned$continent, levels = c("Asia", "Europe"))

# Check unique levels of the 'continent' variable
unique(filtered_data_after_1990_cleaned$continent)
## [1] Asia   Europe <NA>  
## Levels: Asia Europe
# Using the t-test for to compare the means of "Import of goods and services (% of GDP)" between Europe and Asia after 1990

t_test_result <- t.test(`Imports.of.goods.and.services....of.GDP.` ~ continent, data = filtered_data_after_1990_cleaned)

t_test_result
## 
##  Welch Two Sample t-test
## 
## data:  Imports.of.goods.and.services....of.GDP. by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
##  -2.321099 12.433240
## sample estimates:
##   mean in group Asia mean in group Europe 
##             46.84531             41.78924

#According the t-test results that there is no significant difference in the means of “Import of goods and services (% of GDP)” between Europe and Asia after 1990, the p-value is 0.1776, which is greater than the typical significance level of 0.05. Therefore, there is no enough evidence to reject the null hypothesis that true difference in means between Asia and Europe is equal to 0.

Visualization

# Interactive Bar Plot
plot_ly(filtered_data_after_1990_cleaned, x = ~continent, y = ~`Imports.of.goods.and.services....of.GDP.`, type = "bar") %>%
  layout(
    title = "Imports of Goods and Services (% of GDP) - Europe vs. Asia (After 1990)",
    xaxis = list(title = "Continent"),
    yaxis = list(title = "Imports (% of GDP)")
  )
## Warning: Ignoring 304 observations
  1. What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
# Since there is no need for a statistical test for this question, instead let's calculate the Population Density for each country across all years and identify the one with the highest average.

# Calculate average 'Population Density' for each country
average_population_density <- gapminder_data %>%
  group_by(Country.Name) %>%
  summarize(avg_population_density = mean(`Population.density..people.per.sq..km.of.land.area.`, na.rm = TRUE)) %>%
  arrange(desc(avg_population_density))

# Display the country with the highest average population density
head(average_population_density, 1)
## # A tibble: 1 × 2
##   Country.Name     avg_population_density
##   <chr>                             <dbl>
## 1 Macao SAR, China                 14732.

Visualization

# Interactive Bar Plot
plot_ly(average_population_density, x = ~Country.Name, y = ~avg_population_density, type = "bar") %>%
  layout(
    title = "Average Population Density Across All Years",
    xaxis = list(title = "Country"),
    yaxis = list(title = "Average Population Density")
  )
## Warning: Ignoring 1 observations
  1. What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ between 1962 and 2007?
# Since there is no need for statistical test for this one, let's calculate the difference in "Life expectancy in between 1962 and 2007 for each country and identifying the one with he greatest increase.

# Calculate the difference in life expectancy between 1962 and 2007 for each country
life_expectancy_difference <- gapminder_data %>%
  filter(Year %in% c(1962, 2007)) %>%
  group_by(Country.Name) %>%
  summarize(difference = diff(`Life.expectancy.at.birth..total..years.`)) %>%
  arrange(desc(difference))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Country.Name'. You can override using the
## `.groups` argument.
# Display the country with the greatest increase in life expectancy
head(life_expectancy_difference, 1)
## # A tibble: 1 × 2
## # Groups:   Country.Name [1]
##   Country.Name difference
##   <chr>             <dbl>
## 1 Maldives           36.9

Visualization

# Create an interactive scatter plot
plot_ly(
  data = life_expectancy_difference, x = ~difference, y = ~Country.Name, type = "scatter", mode = "markers",
  marker = list(color = ~difference, colorscale = "Viridis"),
  text = ~ paste("Country: ", Country.Name, "<br>Life Expectancy Difference: ", round(difference, 2))
) %>%
  layout(
    title = "Difference in Life Expectancy (2007 - 1962) for Each Country",
    xaxis = list(title = "Life Expectancy Difference"),
    yaxis = list(title = "Country"),
    hovermode = "closest"
  )
## Warning: Ignoring 23 observations